import plotly.graph_objects as go
import plotly.io as pio
# Custom "nike" Plotly template: condensed-bold titles, Helvetica body text,
# a blue/grey colorway, and bar traces labelled outside with currency text.
_title_font = {
    # Note - 'family' must be a single string, NOT a list or dict!
    'family': 'HelveticaNeue-CondensedBold, Helvetica, Sans-serif',
    'size': 30,
    'color': '#333',
}
_body_font = {
    'family': 'Helvetica Neue, Helvetica, Sans-serif',
    'size': 12,
    'color': '#333',
}
_nike_layout = {
    # Fonts
    'title': {'font': _title_font},
    'font': _body_font,
    # Colorways
    'colorway': ["#2445ec", '#a4abab'],
    # Keep adding others as needed below
    #'hovermode': 'x unified'
}
_nike_data = {
    # Each graph object must be in a tuple or list for each trace
    'bar': [go.Bar(
        texttemplate='%{value:$.2s}',
        textposition='outside',
        textfont={'family': 'Helvetica Neue, Helvetica, Sans-serif',
                  'size': 20,
                  'color': '#FFFFFF'},
    )],
}
pio.templates["nike"] = go.layout.Template(layout=_nike_layout, data=_nike_data)
---
title: Assignment 03
author:
  - name: Dominique Strong
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: today
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2
execute:
  echo: false
  eval: false
  freeze: true
---
Data Importing and Preparation
import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id
# Fix the RNG seed so the jitter added to the scatter plots below is reproducible.
np.random.seed(42)
# Render Plotly figures inline in the notebook output.
pio.renderers.default = "notebook"
# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()
# Load Data
# multiLine + escape handle quoted fields that contain commas and newlines.
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")
# Register the DataFrame as a SQL view so spark.sql() queries below can use it.
df.createOrReplaceTempView("job_postings")
# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")
#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 25/09/24 03:14:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 25/09/24 03:14:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
# Cast the salary and experience columns from string to float for numeric work.
for _numeric_col in ("SALARY", "SALARY_FROM", "SALARY_TO",
                     "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE"):
    df = df.withColumn(_numeric_col, col(_numeric_col).cast("float"))

# Collapse REMOTE_TYPE codes into a readable Remote_Group label.
# Code 3 -> Hybrid, code 1 -> Remote; everything else (including 2 and
# null) falls through to Onsite, exactly as before.
df = df.withColumn(
    "Remote_Group",
    when(col("REMOTE_TYPE") == 3, "Hybrid")
    .when(col("REMOTE_TYPE") == 1, "Remote")
    .otherwise("Onsite")
)

# For question #3 - group the minimum-education codes into broader buckets.
# Codes 2/3/4 map to named degrees; every other code (99, 0, 1, null, ...)
# lands in "Associate or lower" via the otherwise branch.
df = df.withColumn(
    "EDU_GROUP",
    when(col("MIN_EDULEVELS") == 2, "Bachelor")
    .when(col("MIN_EDULEVELS") == 3, "Master's")
    .when(col("MIN_EDULEVELS") == 4, "PhD")
    .otherwise("Associate or lower")
)
# Computing the medians
def compute_median(spark_df, col_name):
    """Return the approximate median of *col_name* in *spark_df*.

    Uses Spark's approxQuantile at the 0.5 quantile with a 1% relative
    error, which avoids a full sort of the column.

    Raises:
        ValueError: if the column contains no non-null values — in that
            case approxQuantile returns an empty list, and the original
            code raised a bare IndexError here.
    """
    quantiles = spark_df.approxQuantile(col_name, [0.5], 0.01)
    if not quantiles:
        raise ValueError(f"Column {col_name!r} has no non-null values; cannot compute median")
    return quantiles[0]
# Approximate medians for each salary column; used to impute nulls below.
median_from = compute_median(df, "SALARY_FROM")
median_to = compute_median(df, "SALARY_TO")
median_salary = compute_median(df, "SALARY")
print(f'Medians: SALARY_FROM = {median_from}, SALARY_TO = {median_to}, SALARY = {median_salary}')
[Stage 4:> (0 + 1) / 1]
Medians: SALARY_FROM = 87295.0, SALARY_TO = 130042.0, SALARY = 115024.0
# Impute missing salary fields with their column medians, then derive the
# midpoint of the posted salary range.
df = df.fillna({"SALARY_FROM": median_from,
                "SALARY_TO": median_to,
                "SALARY": median_salary})
df = df.withColumn("Average_Salary", (col("SALARY_FROM") + col("SALARY_TO")) / 2)
# Columns carried into the cleaned export and the pandas analyses below.
export_cols = ['EDUCATION_LEVELS_NAME',
               'REMOTE_TYPE_NAME',
               'EMPLOYMENT_TYPE_NAME',
               'MAX_YEARS_EXPERIENCE',
               'Average_Salary',
               'SALARY',
               'NAICS2_NAME',
               'EDU_GROUP',
               # FIX: was 'REMOTE_GROUP' — the column is created as
               # 'Remote_Group' above and referenced as 'Remote_Group' later;
               # the all-caps spelling only resolved because Spark matches
               # column names case-insensitively by default.
               'Remote_Group',
               'LOT_V6_SPECIALIZED_OCCUPATION_NAME'
               ]
df_selected = df.select(export_cols)
# Materialize to pandas and write the cleaned dataset to disk.
pdf = df_selected.toPandas()
pdf.to_csv("./data/lightcast_cleaned.csv", index=False)
# NOTE(review): this result is never used — pdf is reassigned by the next
# code cell before anything reads it; consider removing this line.
pdf = df_selected.filter(col("SALARY") > 0).select("NAICS2_NAME", "SALARY").toPandas()
1) Salary Distribution by Industry and Employment Type¶
# 1) Salary distribution across industries: one box per NAICS2 sector.
pdf = df_selected.select("NAICS2_NAME", "SALARY").toPandas()
fig = px.box(
    pdf,
    x="NAICS2_NAME",
    y="SALARY",
    title="Salary Distribution by NAICS Name",
    color_discrete_sequence=['#2445ec'],
    points='outliers',
)
fig.update_layout(
    template="nike",
    height=800,
    width=1100,
    margin={"b": 100},
    xaxis={"tickangle": 35, "automargin": True},
    xaxis_title="NAICS Name",
    yaxis_title="Salary (USD)",
)
# Rotate the x-axis labels so long industry names stay readable.
# (This overrides the tickangle set in update_layout just above.)
fig.update_xaxes(tickangle=45, tickfont=dict(size=12))
fig.show()
This graph illustrates how salaries vary by NAICS sector: Administrative, Support, Waste Management and Remediation Services has the highest salaries of all sectors, reaching $500K. Conversely, Unclassified has the lowest salary available, at roughly $15K.
While many NAICS sectors have wide salary ranges, such as Information, Public Admin, and Real Estate, sectors such as Retail Trade cluster tightly around the median salary of $115K, so any salaries well above or below that are considered outliers.
2) Salary Analysis by ONET Occupation Type (Bubble Chart)¶
# Median salary and posting volume per occupation, computed in Spark SQL.
# PERCENTILE(SALARY, 0.5) is the exact median; rows are ordered by posting
# count, largest first.
salary_analysis = spark.sql("""
SELECT
LOT_OCCUPATION_NAME as `Occupation_Name`,
PERCENTILE(SALARY, 0.5) as `Median_Salary`,
count(*) as `Job_Postings`
FROM job_postings
GROUP BY 1
ORDER BY 3 DESC
""")
salary_df = salary_analysis.toPandas()
# Preview the aggregated table.
salary_df.head()
| Occupation_Name | Median_Salary | Job_Postings | |
|---|---|---|---|
| 0 | Data / Data Mining Analyst | 95250.0 | 30057 |
| 1 | Business Intelligence Analyst | 125900.0 | 29445 |
| 2 | Computer Systems Engineer / Architect | 157600.0 | 8212 |
| 3 | Business / Management Analyst | 93650.0 | 4326 |
| 4 | Clinical Analyst / Clinical Documentation and ... | 89440.0 | 261 |
# 2) Bubble chart: one marker per occupation; y is the median salary, and
# both marker size and color encode the number of job postings.
fig = px.scatter(
    salary_df,
    x="Occupation_Name",
    y="Median_Salary",
    size="Job_Postings",
    color="Job_Postings",
    title="Median Salary by Lot Occupation Name",
    labels={"Job_Postings": "Job Postings"},
    hover_data={"Occupation_Name": True, "Median_Salary": True, "Job_Postings": True},
    size_max=60,
    width=1100,
    height=600,
    color_continuous_scale="Plasma",
)
fig.update_layout(
    template="nike",
    margin={"b": 100},
    # Slant the occupation names so they fit under the axis.
    xaxis={"tickangle": -30, "automargin": True},
    xaxis_title="Occupation Name",
    yaxis_title="Median Salary (USD)",
)
fig.show()
This graph shows that Data Mining and Business Intelligence analysts have the highest number of job postings, where the former has a lower median salary compared to the latter. Computer Systems Engineer, however, has a moderate number of job postings at a higher median salary, whereas Business, Clinical, and Market Research Analyst have the lowest number of job postings at the lowest salaries.
3) Salary by Education Level¶
# 3) Experience vs. average salary, colored by education level.
# Keep only rows with a positive salary and positive max experience.
salary_edu_df = df_selected.filter((col("SALARY") > 0) & (col("MAX_YEARS_EXPERIENCE") > 0)).select("Average_Salary", "MAX_YEARS_EXPERIENCE", "EDU_GROUP", "LOT_V6_SPECIALIZED_OCCUPATION_NAME").toPandas()
# Jitter the x values slightly so overlapping points are distinguishable;
# reproducible because np.random.seed(42) was set above.
salary_edu_df["MAX YEARS EXPERIENCE"] = salary_edu_df["MAX_YEARS_EXPERIENCE"] + np.random.uniform(-0.2, 0.2, size=len(salary_edu_df))
fig = px.scatter(
    salary_edu_df,
    x="MAX YEARS EXPERIENCE",
    y="Average_Salary",
    title="Experience vs Salary by Education Level",
    color='EDU_GROUP',
    labels={"EDU_GROUP": "Education Level"}
)
fig.update_layout(
    template="nike",
    height=600,
    width=1100,
    margin=dict(b=100),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)")
fig.update_traces(
    # FIX: 'dark grey' is not a valid CSS color name, so Plotly's color
    # validator rejects it at runtime; the valid spelling is 'darkgrey'.
    marker=dict(line=dict(width=1, color='darkgrey'))
)
# One tick per year of experience
fig.update_xaxes(dtick=1, tickfont=dict(size=12))
fig.show()
This graph illustrates how salary varies by education level and experience - after close examination, it is clear that as the years of experience increase, so does the average salary. For example, for all education levels, when there is more than 3 years of experience, the average salary tends to fall somewhere between $100K and $200K; conversely, with 1 year of experience, average salaries tend to fall below $100K. Interestingly, a Bachelor degree seems to dominate more salaries in the 6+ years of experience range.
4) Salary by Remote Work Type¶
# 4) Experience vs. average salary, colored by remote-work type.
# Same filter as question 3: positive salary and positive max experience.
Remote_df = df_selected.filter((col("SALARY") > 0) & (col("MAX_YEARS_EXPERIENCE") > 0)).select("Average_Salary", "MAX_YEARS_EXPERIENCE", "EDU_GROUP", "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "Remote_Group").toPandas()
fig = px.scatter(
    Remote_df,
    x="MAX_YEARS_EXPERIENCE",
    y="Average_Salary",
    title="Experience vs Salary by Remote Work Type",
    color='Remote_Group',
    labels={"Remote_Group": "Remote Work Type"},
)
fig.update_layout(
    template="nike",
    height=600,
    width=1100,
    margin=dict(b=100),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)")
fig.update_traces(
    # FIX: 'dark grey' is not a valid CSS color name, so Plotly's color
    # validator rejects it at runtime; the valid spelling is 'darkgrey'.
    marker=dict(line=dict(width=1, color='darkgrey'))
)
# One tick per year of experience
fig.update_xaxes(dtick=1, tickfont=dict(size=12))
fig.show()
Similar to the prior graph, this graph compares average salaries across years of experience but now with the type of work situation: onsite, remote, or hybrid. One key callout, the highest average salaries for most (if not all) years of experience is when the work is onsite - onsite workers tend to get paid higher on average compared to remote, whereas hybrid salaries fluctuate the most.